library(tidyverse)
library(readr)
# Load the raw inputs ----
# Order-level table.
orders <- readr::read_csv(file = "../data/ordersall.csv")
# Test ids; joined to `book` by id further down to build the scoring table.
test <- readr::read_csv(file = "../data/booktest.csv")
## Warning: Missing column names filled in: 'X3' [3]
# Customer table holding id, logtargamt, recency, frequency, amount and tof.
book <- readr::read_csv(file = "../data/book.csv")
# Training ids; joined to the modelling table by id further down.
train <- readr::read_csv(file = "../data/booktrain.csv")
## Warning: Missing column names filled in: 'X3' [3]
# Modelling table: rows of `book` with a non-missing logtargamt, restricted to
# the id, the target and the four raw predictors.
new_book <- book[
  !is.na(book$logtargamt),
  c("id", "logtargamt", "recency", "frequency", "amount", "tof")
]
# Binary response: 1 when logtargamt is positive, 0 otherwise.
new_book$after <- as.numeric(new_book$logtargamt > 0)
# Interaction columns: amount x frequency and recency x frequency.
new_book$afint <- with(new_book, amount * frequency)
new_book$rfint <- with(new_book, recency * frequency)
# Logistic regressions for the binary response `after`.
# `fit` is the main-effects model; fit2-fit4 extend it with the interaction
# columns afint (amount x frequency) and rfint (recency x frequency).
# update() re-evaluates the original call with the extended formula, so the
# shared terms, family and data are written only once.
fit <- glm(
  after ~ amount + recency + frequency + tof,
  family = binomial,
  data = new_book
)
fit2 <- update(fit, . ~ . + afint)          # adds afint
fit3 <- update(fit, . ~ . + rfint)          # adds rfint
fit4 <- update(fit, . ~ . + afint + rfint)  # adds both
# Coefficient table and deviance statistics for the main-effects model.
summary(fit)
## 
## Call:
## glm(formula = after ~ amount + recency + frequency + tof, family = binomial, 
##     data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7950  -0.3388  -0.2690  -0.1799   3.5454  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.475e+00  9.779e-02 -25.306  < 2e-16 ***
## amount      -2.690e-04  2.429e-04  -1.108 0.268073    
## recency     -1.291e-03  2.210e-04  -5.841 5.19e-09 ***
## frequency    4.342e-02  1.227e-02   3.539 0.000401 ***
## tof         -4.302e-04  9.657e-05  -4.455 8.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2750.5  on 8310  degrees of freedom
## Residual deviance: 2609.6  on 8306  degrees of freedom
## AIC: 2619.6
## 
## Number of Fisher Scoring iterations: 7
# Coefficient table and deviance statistics for the model that adds afint.
summary(fit2)
## 
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + afint, 
##     family = binomial, data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6871  -0.3399  -0.2674  -0.1781   3.5372  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.533e+00  1.043e-01 -24.295  < 2e-16 ***
## amount       2.203e-04  3.634e-04   0.606 0.544415    
## recency     -1.181e-03  2.302e-04  -5.130 2.90e-07 ***
## frequency    5.779e-02  1.573e-02   3.674 0.000239 ***
## tof         -5.316e-04  1.140e-04  -4.663 3.11e-06 ***
## afint       -1.955e-05  1.271e-05  -1.538 0.124068    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2750.5  on 8310  degrees of freedom
## Residual deviance: 2606.2  on 8305  degrees of freedom
## AIC: 2618.2
## 
## Number of Fisher Scoring iterations: 7
# Coefficient table and deviance statistics for the model that adds rfint.
summary(fit3)
## 
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + rfint, 
##     family = binomial, data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7739  -0.3387  -0.2696  -0.1790   3.5613  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.465e+00  1.016e-01 -24.277  < 2e-16 ***
## amount      -2.686e-04  2.440e-04  -1.101 0.270971    
## recency     -1.339e-03  2.643e-04  -5.066 4.05e-07 ***
## frequency    4.224e-02  1.280e-02   3.301 0.000964 ***
## tof         -4.350e-04  9.757e-05  -4.459 8.25e-06 ***
## rfint        1.238e-05  3.560e-05   0.348 0.727969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2750.5  on 8310  degrees of freedom
## Residual deviance: 2609.5  on 8305  degrees of freedom
## AIC: 2621.5
## 
## Number of Fisher Scoring iterations: 7
# Coefficient table and deviance statistics for the model with afint and rfint.
summary(fit4)
## 
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + afint + 
##     rfint, family = binomial, data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6881  -0.3399  -0.2673  -0.1782   3.5363  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.534e+00  1.101e-01 -23.023  < 2e-16 ***
## amount       2.212e-04  3.677e-04   0.602 0.547345    
## recency     -1.178e-03  2.806e-04  -4.198 2.70e-05 ***
## frequency    5.788e-02  1.651e-02   3.505 0.000456 ***
## tof         -5.315e-04  1.140e-04  -4.660 3.16e-06 ***
## afint       -1.958e-05  1.289e-05  -1.520 0.128576    
## rfint       -6.565e-07  3.750e-05  -0.018 0.986033    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2750.5  on 8310  degrees of freedom
## Residual deviance: 2606.2  on 8304  degrees of freedom
## AIC: 2620.2
## 
## Number of Fisher Scoring iterations: 7
# Rates per unit of tof: frequency / tof and amount / tof.
# A zero tof would turn these ratios into NaN (0 / 0) or Inf (x / 0). glm()
# drops NaN rows through its default NA handling but rejects Inf, so an
# explicit NA is stored whenever tof is 0.
# NOTE(review): the recorded summary reports 87 observations deleted due to
# missingness, so those rows take no part in this fit -- confirm that
# excluding them is intended.
new_book$ordersPer <- with(
  new_book,
  ifelse(tof != 0, frequency / tof, NA_real_)
)
new_book$amountPer <- with(
  new_book,
  ifelse(tof != 0, amount / tof, NA_real_)
)
new_book
# Main-effects logistic model extended with the two per-tof rates.
fit5 <- glm(
  after ~ amount + recency + frequency + tof + ordersPer + amountPer,
  family = binomial,
  data = new_book
)
summary(fit5)
## 
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + ordersPer + 
##     amountPer, family = binomial, data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2880  -0.2921  -0.2553  -0.1922   3.3033  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.1043286  0.1236046 -25.115  < 2e-16 ***
## amount      -0.0004797  0.0002585  -1.856   0.0635 .  
## recency     -0.0009004  0.0002023  -4.451 8.57e-06 ***
## frequency    0.0492773  0.0121759   4.047 5.18e-05 ***
## tof         -0.0001787  0.0001018  -1.756   0.0791 .  
## ordersPer    3.5928284  2.7391340   1.312   0.1896    
## amountPer    0.2720166  0.1001442   2.716   0.0066 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2443.2  on 8223  degrees of freedom
## Residual deviance: 2337.1  on 8217  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 2351.1
## 
## Number of Fisher Scoring iterations: 7
# Join the modelling table to the training file on id; merge() keeps only ids
# present in both inputs.
merged.data1 <- merge(x = new_book, y = train, by = "id")
merged.data1
# Reduced logistic model with recency, frequency and ordersPer only.
# Note that it is fitted on new_book, not on the joined table above.
fit6 <- glm(
  formula = after ~ recency + frequency + ordersPer,
  family = binomial,
  data = new_book
)
summary(fit6)
## 
## Call:
## glm(formula = after ~ recency + frequency + ordersPer, family = binomial, 
##     data = new_book)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6138  -0.2968  -0.2609  -0.1944   3.3160  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.1478229  0.1077500 -29.214  < 2e-16 ***
## recency     -0.0011000  0.0001874  -5.869  4.4e-09 ***
## frequency    0.0226879  0.0066679   3.403 0.000668 ***
## ordersPer    8.2243365  2.4069902   3.417 0.000633 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2443.2  on 8223  degrees of freedom
## Residual deviance: 2349.8  on 8220  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 2357.8
## 
## Number of Fisher Scoring iterations: 7
# Scoring table: rows of `book` whose id also appears in `test`, reduced to
# the id, the target column from `book` (logtargamt.x) and the raw predictors.
merged <- merge(x = book, y = test, by = "id")
merged <- merged[
  ,
  c("id", "logtargamt.x", "recency", "frequency", "amount", "tof")
]
# Rates per unit of tof, matching the columns of the same name in new_book.
# A zero tof would give NaN or Inf; an Inf predictor would push a logistic
# prediction to exactly 0 or 1, so an explicit NA is stored instead and those
# rows come back from predict() as NA.
# NOTE(review): confirm how rows with tof == 0 should be scored.
merged$ordersPer <- with(
  merged,
  ifelse(tof != 0, frequency / tof, NA_real_)
)
merged$amountPer <- with(
  merged,
  ifelse(tof != 0, amount / tof, NA_real_)
)

Training data for the regression:

Test data for the predictions:

# Fit the three-predictor logistic model on the training join, then score
# every row of the test join as a probability (type = "response").
myglm <- glm(
  formula = after ~ recency + frequency + ordersPer,
  data = merged.data1,
  family = "binomial"
)
score <- predict(object = myglm, newdata = merged, type = "response")
# One score per row of `merged`.
length(score)
## [1] 25402
merged.data1

The value above (25402) is the number of rows in the test data.

Can we somehow group amounts by year, so we can see whether only the more recent amounts/orders are influential compared with older ones? Or perhaps customers are more likely to buy during the promotion if they spent more money in the past but haven't been spending much recently. To check this, connect the orders CSV with the book data.

# Responders only: training rows whose logtargamt.x is positive.
resptraining <- subset(merged.data1, logtargamt.x > 0)
resptraining
# Linear model of logtargamt.x on frequency, amount and amountPer, fitted to
# the responder rows.
# NOTE(review): the recorded summary reports 46 observations deleted due to
# missingness -- presumably rows with a non-finite amountPer; confirm.
targ <- lm(
  formula = logtargamt.x ~ frequency + amount + amountPer,
  data = resptraining
)
summary(targ)
## 
## Call:
## lm(formula = logtargamt.x ~ frequency + amount + amountPer, data = resptraining)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.69757 -0.34462  0.04172  0.41559  1.61776 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.2435765  0.0538036  60.285  < 2e-16 ***
## frequency   -0.0415253  0.0070485  -5.891 1.11e-08 ***
## amount       0.0012425  0.0001835   6.770 7.73e-11 ***
## amountPer    0.1663117  0.0511670   3.250   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6261 on 276 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.2465, Adjusted R-squared:  0.2383 
## F-statistic:  30.1 on 3 and 276 DF,  p-value: < 2.2e-16
# Scatter plot of frequency (y) against amount (x) for the modelling table.
plot(x = new_book$amount, y = new_book$frequency)

# Display the orders table.
orders